import re

import matplotlib.pyplot as plt
import matplotlib.style as st
import pandas as pd
import seaborn as sns
# Apply the "ggplot" style sheet to every figure drawn below.
plt.style.use("ggplot")


# Load only the columns used by this analysis.
df = pd.read_csv(
    "newJordan.csv",
    usecols=["phone", "religion", "birthday", "first", "gender", "lang", "email"],
)


# Remove rows that are exact duplicates across all loaded columns.
df.drop_duplicates(keep="first", inplace=True)


# Number of rows per gender value.
df.groupby("gender")["gender"].count()

gender
female     977629
male      2053416
Name: gender, dtype: int64


# Bar chart of the per-gender row counts.
(
    df.groupby("gender")["gender"]
    .count()
    .plot.bar(color=("purple", "blue"), edgecolor="black")
)

<AxesSubplot:xlabel='gender'>


# Store phone numbers as text so string prefix tests can be applied to them.
df["phone"] = df["phone"].astype(str)


# Number of phone entries after the string cast.
df["phone"].count()

3099842


# Rows per phone prefix; each variable is named after the label that prefix
# is mapped to further down in this file.
Orange = df.loc[df["phone"].str.startswith("96277"), "phone"].count()
Zain = df.loc[df["phone"].str.startswith("96279"), "phone"].count()
Umniah = df.loc[df["phone"].str.startswith("96278"), "phone"].count()


# Overwrite each number that starts with a known prefix with its label.
# Once a row holds a label it no longer starts with any of the numeric
# prefixes, so the three replacements cannot interfere with each other.
df["phone"] = df["phone"].mask(df["phone"].str.startswith("96279"), "Zain")
df["phone"] = df["phone"].mask(df["phone"].str.startswith("96277"), "Orange")
df["phone"] = df["phone"].mask(df["phone"].str.startswith("96278"), "Umniah")


# df.replace(df[df.phone.str.startswith("96279")].phone,"Zain")


# Count phone entries that belong to none of the three prefixes.
# The matching numbers are overwritten with the labels "Zain", "Orange" and
# "Umniah" earlier in this file, so a test on the numeric prefixes alone
# excludes nothing and returns the full row count.  Checking prefixes and
# labels together gives the right answer whether or not the relabelling
# has already run.
df[~df.phone.str.startswith(("96277", "96279", "96278", "Zain", "Orange", "Umniah"))].phone.count()

3099842


# Rewrite every religion value containing "uman" or "UMAN" to "Humanism".
df = df.replace(
    {"religion": [".*(uman|UMAN).*"]},
    "Humanism",
    regex=True,
)


# Total number of rows covered by the three phone prefixes.
Zain + Orange + Umniah

3099797


# Pie chart of the share of rows per carrier.
# plt.pie pairs values, labels and colours by position, so the counts must
# be listed in the same order as the labels: Zain, Orange, Umniah.
plt.gcf().set_size_inches(8,8)
plt.pie([Zain,Orange,Umniah],labels=["Zain","Orange","Umniah"],colors=["blue","orange","green"],shadow=True,autopct="%1.1f%%")
plt.legend()

<matplotlib.legend.Legend at 0x7fecf722f4c0>


# Frequency of each raw religion value before normalisation.
df["religion"].value_counts()

الاسلام                                                    6531
Muslim                                                     5257
Muslim - Sunni                                             4168
Islam                                                      4113
مسلم                                                       3580
                                                           ... 
مسلم وراسي مرفوع                                              1
Other (كلي جروح                                               1
Muslim - Sunni  █████████ 100% ... I <3 God                   1
Other (معتقداتي انسانية                                       1
المسيحيه (في البداء كان الكلمه ولكلمه كان عند الله وكلم       1
Name: religion, Length: 11638, dtype: int64


# Normalise the free-text "religion" column into a handful of labels.
# Each pass is a regex replace restricted to that column.  Every pattern is
# wrapped in `.*( ... ).*`, so a (single-line) value containing any listed
# fragment is rewritten to the label as a whole.  The passes are
# order-dependent: once a value has been rewritten to a label, later passes
# only see the label text.

# Pass 1: fragments treated as Muslim (Latin, Cyrillic and Arabic spellings).
df=df.replace({"religion":[".*(سن|Islam|isl|Musulman|sl|Ислам|SL|Sl|سل|مح|محم).*"]},"Muslim",regex=True)


# Pass 2: fragments treated as Christian.  This runs before the broad Arabic
# pass below, so values containing "مسي" are labelled here first.
df=df.replace({"religion":[".*(Христианство|sih|seh|cristian|atholic|ch|CH|Ch|مسي).*"]},"Christian",regex=True)


# Pass 3: values mentioning "God" or "Other" (in the listed casings) become "Other".
df=df.replace({"religion":[".*(God|god|GOD|Other|other).*"]},"Other",regex=True)


# Pass 4: second Muslim pass with further spellings.  NOTE(review): the
# alternative "م" is a single letter, so any value still containing it at
# this point is labelled "Muslim" — confirm this breadth is intended.
df=df.replace({"religion":[".*(الله|م|ﺍﻟ|isalam|ﺍﻟﻠﻪ|Isalm|allah|ﻣﺴﻠﻢ|Allah|isalm|لل|الا).*"]},"Muslim",regex=True)


# Pass 5: values containing "indu" or "udd" share one combined label.
df=df.replace({"religion":[".*(indu|udd).*"]},"Hindu & Buddhist",regex=True)


# Pass 6: values containing "uman"/"UMAN" become "Humanism" (a Humanism pass
# also appears earlier in this file).
df=df.replace({"religion":[".*(uman|UMAN).*"]},"Humanism",regex=True)


# Pass 7: values containing "ruze" or "urzi" become "Druze".
df=df.replace({"religion":[".*(ruze|urzi).*"]},"Druze",regex=True)


# Pass 8: secular / agnostic / atheist / "none"-style answers.  NOTE(review):
# "no" and "No" are plain substrings, so any remaining value that merely
# contains them is also labelled "Non believer".
df=df.replace({"religion":[".*(ecular|gnostic|Ath|None|none|أدري|no|No|NON|non|NO).*"]},"Non believer",regex=True)


# Frequency of each religion value after the passes above.
df.religion.value_counts()

Muslim          55734
Christian        1751
Other             588
Non believer      140
Humanism           42
                ...  
durze ()            1
Memes ()            1
00                  1
Neutral ()          1
Metal \m/           1
Name: religion, Length: 424, dtype: int64


# Percentage share of the ten most frequent religion values.
df["religion"].value_counts(normalize=True).iloc[:10] * 100

Muslim                              94.837326
Christian                            2.979513
Other                                1.000545
Non believer                         0.238225
Humanism                             0.071467
Hindu & Buddhist                     0.035734
Druze                                0.030629
What are your religious beliefs?     0.010210
ا                                    0.010210
musilm ()                            0.006806
Name: religion, dtype: float64


# Bar chart of the seven most frequent non-missing religion values.
df["religion"].dropna().value_counts().iloc[:7].plot.bar(figsize=(10, 5))

<AxesSubplot:>


# Same top-seven chart, drawn on a logarithmic y axis.
df["religion"].dropna().value_counts().iloc[:7].plot.bar(
    figsize=(10, 5),
    color="blue",
    edgecolor="black",
)
plt.yscale("log")


# Bar chart of the twenty most frequent "lang" values.
df["lang"].value_counts().iloc[:20].plot.bar(figsize=(10, 5))

<AxesSubplot:>


# Relative share of the four most frequent "lang" values, log-scaled y axis.
df["lang"].value_counts(normalize=True).iloc[:4].plot.bar(
    figsize=(10, 5),
    color="blue",
    edgecolor="black",
)
plt.legend()
plt.yscale("log")


# Counts of the "lang" values ranked 5th to 20th by frequency.
df["lang"].value_counts().iloc[4:20].plot.bar(figsize=(10, 5), edgecolor="black")
plt.legend()

<matplotlib.legend.Legend at 0x7fecbb916dc0>


# Pie chart of the twenty most frequent values in the "first" column.
df["first"].value_counts().iloc[:20].plot.pie(figsize=(10, 10))
# df.religion.value_counts()

<AxesSubplot:ylabel='first'>


# Birthdays that are present at all.
dateSeries = df["birthday"].dropna()


# Entries with three consecutive digits are taken to include a year.
two = dateSeries[dateSeries.str.contains(r"\d\d\d")]


# The remaining entries get "/2020" appended so they can be parsed as
# complete dates.
one = dateSeries[~dateSeries.str.contains(r"\d\d\d")] + "/2020"


# Combine both groups (original row labels are kept) and parse them.
dates = pd.to_datetime(pd.concat([two, one], axis=0))


# Number of birthdays per calendar month.
dates.dt.month.value_counts().sort_index().plot.bar(
    figsize=(10, 5),
    color="cyan",
    edgecolor="black",
)

<AxesSubplot:>


# Birth-year histogram.
# Birthdays without a year were padded with the placeholder "/2020" before
# parsing, so they must not be counted here.  Select the rows that carried
# a real year (the row labels of `two`) instead of dropping whichever year
# happens to be most frequent, which only works if the placeholder ranks first.
dates.loc[two.index].dt.year.value_counts().sort_index().plot(figsize=(20,10),kind="bar",color="b")

<AxesSubplot:>


# Parse only the birthdays that include a year.
two = pd.to_datetime(two)


# Table of birthday counts: one row per day of month, one column per month.
dat = two.dt.month.groupby(two.dt.day).value_counts().unstack()


# Annotated heatmap of the day/month table, with the month axis on top.
fig, ax = plt.subplots(figsize=(12, 18))
sns.heatmap(
    dat,
    cmap="RdBu_r",
    vmin=50,
    annot=True,
    fmt=".0f",
    linewidths=1,
    linecolor="black",
    ax=ax,
)
ax.set_ylabel("day")
ax.set_xlabel("month")
ax.xaxis.set_ticks_position("top")
ax.xaxis.set_label_position("top")


# E-mail addresses that are present.
emails = df["email"].dropna()


# Provider = text between "@" and the first following dot; keep the ten
# most frequent.
provider = emails.str.extract(r"@(.*?)\.", expand=False).value_counts().iloc[:10]


# Horizontal bar chart of the provider counts, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(12, 5))
bars = ax.barh(provider.index, provider.values, color="yellow", edgecolor="black")
ax.bar_label(bars, padding=5)
ax.set_title("count of top 10 email providers")

Text(0.5, 1.0, 'count of top 10 email providers')


# Replace each e-mail address with the name of its provider.
# The patterns only look at the part after "@", so an address that merely
# has "yahoo" or "windowslive" in its local part (for example
# "yahoo.fan@gmail.com") keeps its real provider.
df.loc[df.email.str.contains(r"@[^@]*windowslive",na=False),"email"]="windowslive"
df.loc[df.email.str.contains(r"@[^@]*yahoo",na=False),"email"]="yahoo"


for i in provider.index:
    # The raw f-string keeps `\.` as a regex escape for a literal dot.
    # re.escape stops a provider name containing regex metacharacters from
    # being interpreted as a pattern.
    df.loc[df.email.str.contains(rf"@{re.escape(i)}\.",na=False),"email"]=i


# Ten most frequent values in the e-mail column after relabelling.
df["email"].value_counts().iloc[:10]

yahoo          12922
hotmail         5983
gmail           3721
live             419
ymail            208
outlook          181
windowslive      119
mail              79
icloud            36
rocketmail        34
Name: email, dtype: int64


# Display the top-10 provider counts extracted from the raw addresses, for
# comparison with the counts of the relabelled e-mail column.
provider

yahoo          12920
hotmail         5983
gmail           3723
live             419
ymail            208
outlook          181
windowslive      119
mail              79
icloud            36
rocketmail        34
Name: email, dtype: int64

# Display the DataFrame after all of the relabelling steps above.
df


# ddata=newd[["religion","birthday","gender","lang","first"]]
# ddata.count()


# ddata.to_csv("modifiedJO.csv",index=False)


# df.to_csv("data.csv",index=False)


# dates.to_csv("2.csv",index=False)


# provider.to_csv("emails.csv")

	email	phone	religion	birthday	first	gender	lang
0	NaN	Zain	Muslim	02/23/1986	احمد ظاهر	male	en_GB
1	NaN	Zain	NaN	NaN	Ahmad	male	en_US
2	NaN	Zain	NaN	06/17	AlHawi	male	en_GB
3	NaN	Zain	NaN	NaN	Abed	male	ar_AR
4	NaN	Zain	NaN	NaN	Hassan	male	ar_AR
...	...	...	...	...	...	...	...
3105977	NaN	96232312412	Muslim	02/21/1990	سالم	male	ar_AR
3105978	y.asaidat@pra.gov.jo	96232156044	NaN	08/20/1975	Yaseen	male	ar_AR
3105979	NaN	96227381601	Muslim	NaN	بتهمنا	male	ar_AR
3105980	yahoo	96227243300	NaN	NaN	Kamal	male	ar_AR
3105981	NaN	96226210443	Muslim	10/02/1984	وليد	male	ar_AR